import numpy as np
import pandas as pd
import math
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn import tree
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from time import time
from sklearn.metrics import r2_score
import os
from google.colab import drive
# Mount Google Drive so the dataset CSV can be read from it (Colab-only side effect).
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Load the King County house-sales dataset (21,613 rows per the describe() output below).
data = pd.read_csv('/content/drive/MyDrive/kc_house_data.csv')
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing as tm
# Preview the first 20 rows.
data.head(20)
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
| 5 | 7237550310 | 20140512T000000 | 1225000.0 | 4 | 4.50 | 5420 | 101930 | 1.0 | 0 | 0 | 3 | 11 | 3890 | 1530 | 2001 | 0 | 98053 | 47.6561 | -122.005 | 4760 | 101930 |
| 6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 47.3097 | -122.327 | 2238 | 6819 |
| 7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 47.4095 | -122.315 | 1650 | 9711 |
| 8 | 2414600126 | 20150415T000000 | 229500.0 | 3 | 1.00 | 1780 | 7470 | 1.0 | 0 | 0 | 3 | 7 | 1050 | 730 | 1960 | 0 | 98146 | 47.5123 | -122.337 | 1780 | 8113 |
| 9 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 47.3684 | -122.031 | 2390 | 7570 |
| 10 | 1736800520 | 20150403T000000 | 662500.0 | 3 | 2.50 | 3560 | 9796 | 1.0 | 0 | 0 | 3 | 8 | 1860 | 1700 | 1965 | 0 | 98007 | 47.6007 | -122.145 | 2210 | 8925 |
| 11 | 9212900260 | 20140527T000000 | 468000.0 | 2 | 1.00 | 1160 | 6000 | 1.0 | 0 | 0 | 4 | 7 | 860 | 300 | 1942 | 0 | 98115 | 47.6900 | -122.292 | 1330 | 6000 |
| 12 | 114101516 | 20140528T000000 | 310000.0 | 3 | 1.00 | 1430 | 19901 | 1.5 | 0 | 0 | 4 | 7 | 1430 | 0 | 1927 | 0 | 98028 | 47.7558 | -122.229 | 1780 | 12697 |
| 13 | 6054650070 | 20141007T000000 | 400000.0 | 3 | 1.75 | 1370 | 9680 | 1.0 | 0 | 0 | 4 | 7 | 1370 | 0 | 1977 | 0 | 98074 | 47.6127 | -122.045 | 1370 | 10208 |
| 14 | 1175000570 | 20150312T000000 | 530000.0 | 5 | 2.00 | 1810 | 4850 | 1.5 | 0 | 0 | 3 | 7 | 1810 | 0 | 1900 | 0 | 98107 | 47.6700 | -122.394 | 1360 | 4850 |
| 15 | 9297300055 | 20150124T000000 | 650000.0 | 4 | 3.00 | 2950 | 5000 | 2.0 | 0 | 3 | 3 | 9 | 1980 | 970 | 1979 | 0 | 98126 | 47.5714 | -122.375 | 2140 | 4000 |
| 16 | 1875500060 | 20140731T000000 | 395000.0 | 3 | 2.00 | 1890 | 14040 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 1994 | 0 | 98019 | 47.7277 | -121.962 | 1890 | 14018 |
| 17 | 6865200140 | 20140529T000000 | 485000.0 | 4 | 1.00 | 1600 | 4300 | 1.5 | 0 | 0 | 4 | 7 | 1600 | 0 | 1916 | 0 | 98103 | 47.6648 | -122.343 | 1610 | 4300 |
| 18 | 16000397 | 20141205T000000 | 189000.0 | 2 | 1.00 | 1200 | 9850 | 1.0 | 0 | 0 | 4 | 7 | 1200 | 0 | 1921 | 0 | 98002 | 47.3089 | -122.210 | 1060 | 5095 |
| 19 | 7983200060 | 20150424T000000 | 230000.0 | 3 | 1.00 | 1250 | 9774 | 1.0 | 0 | 0 | 4 | 7 | 1250 | 0 | 1969 | 0 | 98003 | 47.3343 | -122.306 | 1280 | 8850 |
# Summary statistics for every numeric column.
data.describe()
| id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.161300e+04 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 |
| mean | 4.580302e+09 | 5.400881e+05 | 3.370842 | 2.114757 | 2079.899736 | 1.510697e+04 | 1.494309 | 0.007542 | 0.234303 | 3.409430 | 7.656873 | 1788.390691 | 291.509045 | 1971.005136 | 84.402258 | 98077.939805 | 47.560053 | -122.213896 | 1986.552492 | 12768.455652 |
| std | 2.876566e+09 | 3.671272e+05 | 0.930062 | 0.770163 | 918.440897 | 4.142051e+04 | 0.539989 | 0.086517 | 0.766318 | 0.650743 | 1.175459 | 828.090978 | 442.575043 | 29.373411 | 401.679240 | 53.505026 | 0.138564 | 0.140828 | 685.391304 | 27304.179631 |
| min | 1.000102e+06 | 7.500000e+04 | 0.000000 | 0.000000 | 290.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 290.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155900 | -122.519000 | 399.000000 | 651.000000 |
| 25% | 2.123049e+09 | 3.219500e+05 | 3.000000 | 1.750000 | 1427.000000 | 5.040000e+03 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1190.000000 | 0.000000 | 1951.000000 | 0.000000 | 98033.000000 | 47.471000 | -122.328000 | 1490.000000 | 5100.000000 |
| 50% | 3.904930e+09 | 4.500000e+05 | 3.000000 | 2.250000 | 1910.000000 | 7.618000e+03 | 1.500000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1560.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.571800 | -122.230000 | 1840.000000 | 7620.000000 |
| 75% | 7.308900e+09 | 6.450000e+05 | 4.000000 | 2.500000 | 2550.000000 | 1.068800e+04 | 2.000000 | 0.000000 | 0.000000 | 4.000000 | 8.000000 | 2210.000000 | 560.000000 | 1997.000000 | 0.000000 | 98118.000000 | 47.678000 | -122.125000 | 2360.000000 | 10083.000000 |
| max | 9.900000e+09 | 7.700000e+06 | 33.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 4.000000 | 5.000000 | 13.000000 | 9410.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777600 | -121.315000 | 6210.000000 | 871200.000000 |
# Extract the sale year from the 'date' string (format 'YYYYMMDDT000000').
# Vectorized string slice + astype replaces the original row-by-row Python
# loop that appended substrings to a list and converted them afterwards.
data['years'] = data['date'].str[:4].astype(int)
data.head()
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 | 2014 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 | 2014 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 | 2015 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 | 2014 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 | 2015 |
# Age of the house at the time of sale (sale year minus build year).
data['age'] = data['years'] - data['yr_built']
data.head()
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | years | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 | 2014 | 59 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 | 2014 | 63 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 | 2015 | 82 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 | 2014 | 49 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 | 2015 | 28 |
# updated_age: years since the last renovation, falling back to the plain
# house age when the home was never renovated (yr_renovated == 0).
# np.where replaces the original row-by-row Python loop; results are
# identical (integer inputs yield an int64 column).
data['updated_age'] = np.where(
    data.yr_renovated == 0,
    data.age,
    data.years - data.yr_renovated,
)
data.head()
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | years | age | updated_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 | 2014 | 59 | 59 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 | 2014 | 63 | 23 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 | 2015 | 82 | 82 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 | 2014 | 49 | 49 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 | 2015 | 28 | 28 |
# Confirm column dtypes — only 'date' is a non-numeric (object) column.
data.dtypes
id int64 date object price float64 bedrooms int64 bathrooms float64 sqft_living int64 sqft_lot int64 floors float64 waterfront int64 view int64 condition int64 grade int64 sqft_above int64 sqft_basement int64 yr_built int64 yr_renovated int64 zipcode int64 lat float64 long float64 sqft_living15 int64 sqft_lot15 int64 years int64 age int64 updated_age int64 dtype: object
# Drop the identifier, raw date/year, and zipcode columns now that 'age'
# and 'updated_age' have been derived from them.
data = data.drop(['id','date', 'years', 'yr_built', 'yr_renovated', 'zipcode'], axis = 1)
#Check any number of columns with NaN or missing values
empty_col = data.isnull().any().sum()
print('total no. of columns which have any Null value : {}'.format(empty_col))
total no. of columns which have any Null value : 0
# Check any number of data points with NaN (row-wise null check)
empty_rows = data.isnull().any(axis=1).sum()
print('total no. of rows which have any Null value : {}'.format(empty_rows))
total no. of rows which have any Null value : 0
# NOTE(review): these five imports duplicate the ones after the initial CSV
# load (L20-L24 of this file) — concatenated Colab cells; harmless re-imports.
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
data.head()
| price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | lat | long | sqft_living15 | sqft_lot15 | age | updated_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 47.5112 | -122.257 | 1340 | 5650 | 59 | 59 |
| 1 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 47.7210 | -122.319 | 1690 | 7639 | 63 | 23 |
| 2 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 47.7379 | -122.233 | 2720 | 8062 | 82 | 82 |
| 3 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 47.5208 | -122.393 | 1360 | 5000 | 49 | 49 |
| 4 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 47.6168 | -122.045 | 1800 | 7503 | 28 | 28 |
# Pairwise scatter matrix of all features (slow on ~21k rows).
sns.pairplot(data)
plt.show()
# Scatter plots of price against individual candidate features.
data.plot(kind='scatter', x = 'sqft_living', y = 'price');
plt.show()
data.plot(kind='scatter', y = 'price', x = 'bathrooms');
plt.show()
data.plot(kind='scatter', y = 'price', x = 'updated_age');
plt.show()
data.plot(kind='scatter', y = 'price', x = 'age');
plt.show()
data.plot(kind='scatter', y = 'price', x = 'condition');
plt.show()
# Histograms of every numeric column.
h = data.hist(bins=25,figsize=(16,16),xlabelsize='10',ylabelsize='10',xrot=-15)
# Frequency of each bedroom count. The series is passed as a keyword (x=)
# because positional data arguments are deprecated in seaborn (see the
# FutureWarning emitted by the original call) and are removed in 0.12.
sns.countplot(x=data.bedrooms, order = data['bedrooms'].value_counts().index)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e29641610>
# Frequency of each 'view' rating; keyword x= avoids the seaborn
# positional-argument FutureWarning (removed in seaborn 0.12).
sns.countplot(x=data.view, order = data['view'].value_counts().index)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e27313e50>
# Frequency of waterfront vs non-waterfront homes; keyword x= avoids the
# seaborn positional-argument FutureWarning (removed in seaborn 0.12).
sns.countplot(x=data.waterfront, order = data['waterfront'].value_counts().index)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e27778550>
# Frequency of each construction grade; keyword x= avoids the seaborn
# positional-argument FutureWarning (removed in seaborn 0.12).
sns.countplot(x=data.grade, order = data['grade'].value_counts().index)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e3d713510>
# Frequency of each house age; keyword x= avoids the seaborn
# positional-argument FutureWarning (removed in seaborn 0.12).
sns.countplot(x=data.age, order = data['age'].value_counts().index)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e27689790>
# List the remaining feature columns.
data.columns
Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
'waterfront', 'view', 'condition', 'grade', 'sqft_above',
'sqft_basement', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'age',
'updated_age'],
dtype='object')
# Box plots of price against the low-cardinality discrete features.
f, axe = plt.subplots(1, 1,figsize=(10,5))
sns.boxplot(x=data['bedrooms'],y=data['price'],ax=axe)
f, axe = plt.subplots(1, 1,figsize=(15,5))
sns.boxplot(x=data['bathrooms'],y=data['price'],ax=axe)
f, axe = plt.subplots(1, 1,figsize=(12,5))
sns.boxplot(x=data['floors'],y=data['price'],ax=axe)
# NOTE(review): a long run of per-feature boxplot calls was disabled by
# wrapping it in a bare triple-quoted string — a no-op expression that only
# echoed itself as a cell output. Removed as dead code; re-add individual
# `sns.boxplot(x=data[col], y=data['price'], ax=axe)` plots if needed.
"f, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['sqft_living'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['sqft_lot'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['waterfront'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['view'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['condition'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['grade'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['sqft_above'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['sqft_basement'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['lat'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['long'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['sqft_living15'],y=data['price'],ax=axe)\nf, axe = plt.subplots(1, 1,figsize=(12,5))\nsns.boxplot(x=data['sqft_lot15'],y=data['price'],ax=axe)"
# Remove physically inconsistent rows where the lot is smaller than the
# living area. A direct boolean filter replaces the original two-step
# "select offending rows, then drop their index list" — identical result.
data = data[data['sqft_lot'] >= data['sqft_living']]
data1 = data.copy()
import numpy as np  # NOTE(review): duplicate of the top-of-file import; harmless
# Deciles (0th-100th percentile) of bedroom count, to spot outliers.
bed_per = np.percentile(data1.bedrooms, np.arange(0, 101, 10))
for idx, val in enumerate(bed_per):
    # enumerate replaces the original range(len(...)) index loop.
    print("{} percentile value is {}".format(idx * 10, val))
0 percentile value is 0.0 10 percentile value is 2.0 20 percentile value is 3.0 30 percentile value is 3.0 40 percentile value is 3.0 50 percentile value is 3.0 60 percentile value is 4.0 70 percentile value is 4.0 80 percentile value is 4.0 90 percentile value is 4.0 100 percentile value is 33.0
# Zoom into the upper tail: the 90th-100th percentiles of bedroom count.
bed_per = np.percentile(data1.bedrooms, np.arange(90, 101, 1))
for i in range(len(bed_per)):
    # BUG FIX: label with the actual percentile (90 + i). The original
    # printed the bare array index, mislabeling e.g. the 90th percentile
    # as "0 percentile" (visible in the cell output).
    print("{} percentile value is {}".format(90 + i, bed_per[i]))
0 percentile value is 4.0 1 percentile value is 5.0 2 percentile value is 5.0 3 percentile value is 5.0 4 percentile value is 5.0 5 percentile value is 5.0 6 percentile value is 5.0 7 percentile value is 5.0 8 percentile value is 5.0 9 percentile value is 6.0 10 percentile value is 33.0
# Fine-grained tail: 99.0-99.9 percentiles of bedroom count, computed by
# indexing into the sorted values. The sort is loop-invariant, so it is
# hoisted out of the loop (the original re-extracted and re-sorted the
# full column on every one of the 10 iterations).
var = np.sort(data1["bedrooms"].values, axis=None)
for i in np.arange(0.0, 1.0, 0.1):
    print("{} percentile value is {}".format(99 + i, var[int(len(var) * (float(99 + i) / 100))]))
print("100 percentile value is ", var[-1])
99.0 percentile value is 6 99.1 percentile value is 6 99.2 percentile value is 6 99.3 percentile value is 6 99.4 percentile value is 6 99.5 percentile value is 6 99.6 percentile value is 6 99.7 percentile value is 6 99.8 percentile value is 7 99.9 percentile value is 8 100 percentile value is 33
# Inspect the 8-bedroom rows before deciding on the outlier cut-off.
data[data.bedrooms == 8]
| price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | lat | long | sqft_living15 | sqft_lot15 | age | updated_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4035 | 2150000.0 | 8 | 6.00 | 4340 | 9415 | 2.0 | 0 | 0 | 3 | 8 | 4340 | 0 | 47.6316 | -122.202 | 2050 | 9100 | 47 | 47 |
| 4067 | 373000.0 | 8 | 3.00 | 2850 | 12714 | 1.0 | 0 | 0 | 3 | 7 | 2850 | 0 | 47.4859 | -122.205 | 1480 | 4942 | 56 | 56 |
| 6174 | 340000.0 | 8 | 2.75 | 2790 | 6695 | 1.0 | 0 | 0 | 3 | 7 | 1470 | 1320 | 47.7565 | -122.331 | 1760 | 7624 | 37 | 37 |
| 9077 | 700000.0 | 8 | 2.50 | 2280 | 3000 | 1.5 | 0 | 0 | 3 | 7 | 1210 | 1070 | 47.6675 | -122.316 | 1610 | 3000 | 104 | 104 |
| 9452 | 900000.0 | 8 | 4.00 | 4020 | 7500 | 1.0 | 0 | 0 | 3 | 8 | 2010 | 2010 | 47.6732 | -122.363 | 1560 | 3737 | 46 | 46 |
| 10958 | 1650000.0 | 8 | 2.75 | 4040 | 20666 | 1.0 | 0 | 0 | 4 | 9 | 2020 | 2020 | 47.6340 | -122.221 | 3670 | 20500 | 52 | 52 |
| 12885 | 808000.0 | 8 | 3.75 | 3460 | 4600 | 2.0 | 0 | 0 | 3 | 7 | 2860 | 600 | 47.6617 | -122.289 | 2170 | 3750 | 27 | 27 |
| 15070 | 430000.0 | 8 | 3.25 | 4300 | 10441 | 2.0 | 0 | 0 | 4 | 8 | 2800 | 1500 | 47.4786 | -122.131 | 1780 | 10457 | 35 | 35 |
| 15670 | 680000.0 | 8 | 2.75 | 2530 | 4800 | 2.0 | 0 | 0 | 4 | 7 | 1390 | 1140 | 47.6241 | -122.305 | 1540 | 4800 | 113 | 113 |
| 17235 | 1970000.0 | 8 | 3.50 | 4440 | 6480 | 2.0 | 0 | 3 | 5 | 10 | 3140 | 1300 | 47.6310 | -122.303 | 4440 | 8640 | 55 | 55 |
| 18477 | 3300000.0 | 8 | 4.00 | 7710 | 11750 | 3.5 | 0 | 0 | 5 | 12 | 6090 | 1620 | 47.6263 | -122.314 | 4210 | 8325 | 110 | 110 |
| 19302 | 575000.0 | 8 | 3.00 | 3840 | 15990 | 1.0 | 0 | 0 | 3 | 7 | 2530 | 1310 | 47.7111 | -122.211 | 1380 | 8172 | 53 | 53 |
# Drop homes with 8+ bedrooms — above the ~99.8th percentile per the
# percentile analysis above — as outliers.
data.drop(data.loc[data['bedrooms']>= 8].index, inplace=True)
# Min-max scale a copy of the data and plot the correlation heatmap.
# (Min-max scaling is a linear transform, so Pearson correlations are
# unchanged; scaling here only standardizes the working copy.)
data2 = data.copy()
scaler = MinMaxScaler()
arr_scaled = scaler.fit_transform(data2)
norm_data = pd.DataFrame(arr_scaled, columns = data2.columns , index=data2.index)
plt.figure(figsize = (30, 25))
sns.heatmap(norm_data.corr(), annot = True, cmap="YlGnBu")
plt.show()
# Drop 'age' and 'sqft_above' — presumably because the heatmap above showed
# them highly correlated with 'updated_age' and 'sqft_living' respectively
# (multicollinearity); TODO confirm against the heatmap output.
data = data.drop(['age','sqft_above'], axis = 1)
data.columns
Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
'waterfront', 'view', 'condition', 'grade', 'sqft_basement', 'lat',
'long', 'sqft_living15', 'sqft_lot15', 'updated_age'],
dtype='object')
# 20,801 rows x 16 columns remain after outlier and column removal.
data.shape
(20801, 16)
# Univariate OLS of price on feature column 1 = bedrooms.
# (The original comment mislabeled this as 'sqft_living'; per the printed
# column index, index 1 is 'bedrooms'.)
x = data.iloc[:, [1]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column that statsmodels OLS does not
# add implicitly; it also removes the hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.100 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.100 |
| Method: | Least Squares | F-statistic: | 2300. |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:27:33 | Log-Likelihood: | -2.9516e+05 |
| No. Observations: | 20801 | AIC: | 5.903e+05 |
| Df Residuals: | 20799 | BIC: | 5.903e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 9.777e+04 | 9556.675 | 10.230 | 0.000 | 7.9e+04 | 1.16e+05 |
| x1 | 1.309e+05 | 2730.067 | 47.962 | 0.000 | 1.26e+05 | 1.36e+05 |
| Omnibus: | 18202.324 | Durbin-Watson: | 1.962 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1110145.551 |
| Skew: | 3.939 | Prob(JB): | 0.00 |
| Kurtosis: | 37.912 | Cond. No. | 14.8 |
# Univariate OLS of price on feature column 2 = bathrooms.
x = data.iloc[:, [2]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.283 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.283 |
| Method: | Least Squares | F-statistic: | 8221. |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:27:36 | Log-Likelihood: | -2.9279e+05 |
| No. Observations: | 20801 | AIC: | 5.856e+05 |
| Df Residuals: | 20799 | BIC: | 5.856e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 3826.3574 | 6310.856 | 0.606 | 0.544 | -8543.413 | 1.62e+04 |
| x1 | 2.557e+05 | 2819.997 | 90.670 | 0.000 | 2.5e+05 | 2.61e+05 |
| Omnibus: | 16563.116 | Durbin-Watson: | 1.965 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 838166.591 |
| Skew: | 3.438 | Prob(JB): | 0.00 |
| Kurtosis: | 33.328 | Cond. No. | 7.66 |
# Univariate OLS of price on feature column 3 = sqft_living.
# (The original comment mislabeled this as 'bathrooms'.)
x = data.iloc[:, [3]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.494 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.494 |
| Method: | Least Squares | F-statistic: | 2.030e+04 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:27:39 | Log-Likelihood: | -2.8917e+05 |
| No. Observations: | 20801 | AIC: | 5.783e+05 |
| Df Residuals: | 20799 | BIC: | 5.784e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -5.066e+04 | 4536.592 | -11.167 | 0.000 | -5.96e+04 | -4.18e+04 |
| x1 | 282.1473 | 1.980 | 142.493 | 0.000 | 278.266 | 286.028 |
| Omnibus: | 14336.657 | Durbin-Watson: | 1.973 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 522191.893 |
| Skew: | 2.846 | Prob(JB): | 0.00 |
| Kurtosis: | 26.877 | Cond. No. | 5.68e+03 |
# Univariate OLS of price on feature column 4 = sqft_lot.
# (The original comment mislabeled this as 'view'.)
x = data.iloc[:, [4]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.008 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.008 |
| Method: | Least Squares | F-statistic: | 167.9 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 3.02e-38 |
| Time: | 05:27:42 | Log-Likelihood: | -2.9617e+05 |
| No. Observations: | 20801 | AIC: | 5.923e+05 |
| Df Residuals: | 20799 | BIC: | 5.924e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 5.286e+05 | 2730.810 | 193.579 | 0.000 | 5.23e+05 | 5.34e+05 |
| x1 | 0.7874 | 0.061 | 12.957 | 0.000 | 0.668 | 0.906 |
| Omnibus: | 18377.587 | Durbin-Watson: | 1.965 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1089643.872 |
| Skew: | 4.014 | Prob(JB): | 0.00 |
| Kurtosis: | 37.536 | Cond. No. | 4.79e+04 |
# Univariate OLS of price on feature column 5 = floors.
# (The original comment mislabeled this as 'bedrooms'.)
x = data.iloc[:, [5]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.082 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.082 |
| Method: | Least Squares | F-statistic: | 1867. |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:27:44 | Log-Likelihood: | -2.9536e+05 |
| No. Observations: | 20801 | AIC: | 5.907e+05 |
| Df Residuals: | 20799 | BIC: | 5.907e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 2.342e+05 | 7514.033 | 31.172 | 0.000 | 2.19e+05 | 2.49e+05 |
| x1 | 2.105e+05 | 4871.800 | 43.205 | 0.000 | 2.01e+05 | 2.2e+05 |
| Omnibus: | 18626.504 | Durbin-Watson: | 1.981 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1222838.777 |
| Skew: | 4.065 | Prob(JB): | 0.00 |
| Kurtosis: | 39.672 | Cond. No. | 6.53 |
# Univariate OLS of price on feature column 6 = waterfront.
# (The original comment mislabeled this as 'sqft_basement'.)
x = data.iloc[:, [6]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.072 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.072 |
| Method: | Least Squares | F-statistic: | 1611. |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:27:46 | Log-Likelihood: | -2.9547e+05 |
| No. Observations: | 20801 | AIC: | 5.910e+05 |
| Df Residuals: | 20799 | BIC: | 5.910e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 5.322e+05 | 2485.869 | 214.086 | 0.000 | 5.27e+05 | 5.37e+05 |
| x1 | 1.138e+06 | 2.83e+04 | 40.143 | 0.000 | 1.08e+06 | 1.19e+06 |
| Omnibus: | 17006.289 | Durbin-Watson: | 1.959 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 873529.323 |
| Skew: | 3.584 | Prob(JB): | 0.00 |
| Kurtosis: | 33.927 | Cond. No. | 11.4 |
# Univariate OLS of price on feature column 7 = view.
# (The original comment mislabeled this as 'waterfront'.)
x = data.iloc[:, [7]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.158 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.158 |
| Method: | Least Squares | F-statistic: | 3905. |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:27:49 | Log-Likelihood: | -2.9446e+05 |
| No. Observations: | 20801 | AIC: | 5.889e+05 |
| Df Residuals: | 20799 | BIC: | 5.889e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 4.955e+05 | 2468.028 | 200.776 | 0.000 | 4.91e+05 | 5e+05 |
| x1 | 1.911e+05 | 3057.556 | 62.486 | 0.000 | 1.85e+05 | 1.97e+05 |
| Omnibus: | 17167.806 | Durbin-Watson: | 1.959 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 973094.774 |
| Skew: | 3.601 | Prob(JB): | 0.00 |
| Kurtosis: | 35.724 | Cond. No. | 1.45 |
# Univariate OLS of price on feature column 8 = condition.
# (The original comment mislabeled this as 'floors'.)
x = data.iloc[:, [8]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.001 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.001 |
| Method: | Least Squares | F-statistic: | 20.63 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 5.60e-06 |
| Time: | 05:27:54 | Log-Likelihood: | -2.9624e+05 |
| No. Observations: | 20801 | AIC: | 5.925e+05 |
| Df Residuals: | 20799 | BIC: | 5.925e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 4.801e+05 | 1.36e+04 | 35.233 | 0.000 | 4.53e+05 | 5.07e+05 |
| x1 | 1.776e+04 | 3908.951 | 4.542 | 0.000 | 1.01e+04 | 2.54e+04 |
| Omnibus: | 18353.561 | Durbin-Watson: | 1.964 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1078313.052 |
| Skew: | 4.009 | Prob(JB): | 0.00 |
| Kurtosis: | 37.349 | Cond. No. | 20.0 |
# Univariate OLS of price on feature column 9 = grade.
# (The original comment mislabeled this as 'lat'.)
x = data.iloc[:, [9]].values
y = data.iloc[:, [0]].values
# add_constant prepends the intercept column, removing the original's
# hard-coded row count np.ones((20801,1)).
x = sm.add_constant(x)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.449 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.449 |
| Method: | Least Squares | F-statistic: | 1.692e+04 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:28:00 | Log-Likelihood: | -2.9006e+05 |
| No. Observations: | 20801 | AIC: | 5.801e+05 |
| Df Residuals: | 20799 | BIC: | 5.801e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -1.058e+06 | 1.24e+04 | -85.060 | 0.000 | -1.08e+06 | -1.03e+06 |
| x1 | 2.092e+05 | 1608.209 | 130.083 | 0.000 | 2.06e+05 | 2.12e+05 |
| Omnibus: | 19162.609 | Durbin-Watson: | 1.975 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1973458.532 |
| Skew: | 4.086 | Prob(JB): | 0.00 |
| Kurtosis: | 50.013 | Cond. No. | 51.2 |
# Single-feature OLS on column 10 of `data`.
# NOTE(review): the original '#features - updated_age' comment is a copy-paste
# mislabel; by the column order shown later this is likely 'sqft_basement' — confirm.
x = data.iloc[:, [10]].values
y = data.iloc[:, [0]].values  # column 0 is the target (price)
# Intercept column sized from the data (no hard-coded row count).
x = np.append(arr=np.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.103 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.103 |
| Method: | Least Squares | F-statistic: | 2384. |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:28:04 | Log-Likelihood: | -2.9512e+05 |
| No. Observations: | 20801 | AIC: | 5.902e+05 |
| Df Residuals: | 20799 | BIC: | 5.903e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 4.628e+05 | 2913.911 | 158.817 | 0.000 | 4.57e+05 | 4.68e+05 |
| x1 | 266.0069 | 5.449 | 48.822 | 0.000 | 255.327 | 276.686 |
| Omnibus: | 17245.218 | Durbin-Watson: | 1.950 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 843777.433 |
| Skew: | 3.686 | Prob(JB): | 0.00 |
| Kurtosis: | 33.318 | Cond. No. | 640. |
# Single-feature OLS on column 11 of `data`.
# NOTE(review): the original '#features - updated_age' comment is a copy-paste
# mislabel; by the column order shown later this is likely 'lat' — confirm.
x = data.iloc[:, [11]].values
y = data.iloc[:, [0]].values  # column 0 is the target (price)
# Intercept column sized from the data (no hard-coded row count).
x = np.append(arr=np.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.098 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.098 |
| Method: | Least Squares | F-statistic: | 2255. |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 05:28:16 | Log-Likelihood: | -2.9518e+05 |
| No. Observations: | 20801 | AIC: | 5.904e+05 |
| Df Residuals: | 20799 | BIC: | 5.904e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -3.882e+07 | 8.29e+05 | -46.837 | 0.000 | -4.04e+07 | -3.72e+07 |
| x1 | 8.277e+05 | 1.74e+04 | 47.489 | 0.000 | 7.94e+05 | 8.62e+05 |
| Omnibus: | 19535.299 | Durbin-Watson: | 1.968 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1449685.117 |
| Skew: | 4.357 | Prob(JB): | 0.00 |
| Kurtosis: | 42.959 | Cond. No. | 1.62e+04 |
# Single-feature OLS on column 12 of `data`.
# NOTE(review): the original '#features - updated_age' comment is a copy-paste
# mislabel; by the column order shown later this is likely 'long' — confirm.
x = data.iloc[:, [12]].values
y = data.iloc[:, [0]].values  # column 0 is the target (price)
# Intercept column sized from the data (no hard-coded row count).
x = np.append(arr=np.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.000 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.000 |
| Method: | Least Squares | F-statistic: | 8.521 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00351 |
| Time: | 02:12:36 | Log-Likelihood: | -2.9625e+05 |
| No. Observations: | 20801 | AIC: | 5.925e+05 |
| Df Residuals: | 20799 | BIC: | 5.925e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 7.046e+06 | 2.23e+06 | 3.162 | 0.002 | 2.68e+06 | 1.14e+07 |
| x1 | 5.323e+04 | 1.82e+04 | 2.919 | 0.004 | 1.75e+04 | 8.9e+04 |
| Omnibus: | 18369.822 | Durbin-Watson: | 1.966 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1079256.989 |
| Skew: | 4.015 | Prob(JB): | 0.00 |
| Kurtosis: | 37.362 | Cond. No. | 1.06e+05 |
# Single-feature OLS on column 13 of `data`.
# NOTE(review): the original '#features - updated_age' comment is a copy-paste
# mislabel; by the column order shown later this is likely 'sqft_living15' — confirm.
x = data.iloc[:, [13]].values
y = data.iloc[:, [0]].values  # column 0 is the target (price)
# Intercept column sized from the data (no hard-coded row count).
x = np.append(arr=np.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.345 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.345 |
| Method: | Least Squares | F-statistic: | 1.095e+04 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 0.00 |
| Time: | 02:12:38 | Log-Likelihood: | -2.9185e+05 |
| No. Observations: | 20801 | AIC: | 5.837e+05 |
| Df Residuals: | 20799 | BIC: | 5.837e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -9.226e+04 | 6399.748 | -14.416 | 0.000 | -1.05e+05 | -7.97e+04 |
| x1 | 316.0152 | 3.020 | 104.624 | 0.000 | 310.095 | 321.936 |
| Omnibus: | 19492.457 | Durbin-Watson: | 1.977 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1856666.629 |
| Skew: | 4.240 | Prob(JB): | 0.00 |
| Kurtosis: | 48.501 | Cond. No. | 6.52e+03 |
# Single-feature OLS on column 14 of `data`.
# NOTE(review): the original '#features - updated_age' comment is a copy-paste
# mislabel; by the column order shown later this is likely 'sqft_lot15' — confirm.
x = data.iloc[:, [14]].values
y = data.iloc[:, [0]].values  # column 0 is the target (price)
# Intercept column sized from the data (no hard-coded row count).
x = np.append(arr=np.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.007 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.007 |
| Method: | Least Squares | F-statistic: | 140.5 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 2.64e-32 |
| Time: | 02:12:41 | Log-Likelihood: | -2.9618e+05 |
| No. Observations: | 20801 | AIC: | 5.924e+05 |
| Df Residuals: | 20799 | BIC: | 5.924e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 5.265e+05 | 2836.476 | 185.620 | 0.000 | 5.21e+05 | 5.32e+05 |
| x1 | 1.0944 | 0.092 | 11.853 | 0.000 | 0.913 | 1.275 |
| Omnibus: | 18364.549 | Durbin-Watson: | 1.965 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1084425.463 |
| Skew: | 4.011 | Prob(JB): | 0.00 |
| Kurtosis: | 37.451 | Cond. No. | 3.40e+04 |
# Single-feature OLS on column 15 of `data`.
# NOTE(review): by the column order shown later, column 15 is likely
# 'updated_age' — here the original label may actually be right; confirm.
x = data.iloc[:, [15]].values
y = data.iloc[:, [0]].values  # column 0 is the target (price)
# Intercept column sized from the data (no hard-coded row count).
x = np.append(arr=np.ones((x.shape[0], 1)).astype(int), values=x, axis=1)
regressor_OLS = sm.OLS(endog=y, exog=x).fit()
regressor_OLS.summary()
| Dep. Variable: | y | R-squared: | 0.014 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.014 |
| Method: | Least Squares | F-statistic: | 295.3 |
| Date: | Sat, 25 Sep 2021 | Prob (F-statistic): | 1.00e-65 |
| Time: | 02:12:43 | Log-Likelihood: | -2.9610e+05 |
| No. Observations: | 20801 | AIC: | 5.922e+05 |
| Df Residuals: | 20799 | BIC: | 5.922e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 6.057e+05 | 4551.733 | 133.071 | 0.000 | 5.97e+05 | 6.15e+05 |
| x1 | -1537.6328 | 89.483 | -17.184 | 0.000 | -1713.026 | -1362.239 |
| Omnibus: | 18312.160 | Durbin-Watson: | 1.976 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1072663.894 |
| Skew: | 3.995 | Prob(JB): | 0.00 |
| Kurtosis: | 37.261 | Cond. No. | 90.7 |
#Dividing data into X and y variables
# pop() removes 'price' from `data` IN PLACE, so after this line `data`
# holds only the feature columns; X_train is an alias of that same frame.
y_train = data.pop('price')
X_train = data
# NOTE(review): these four imports duplicate the top-of-file imports;
# harmless in a notebook, but redundant.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Baseline scikit-learn fit of price on all features; `lm` is not
# referenced again in this section — the statsmodels models below are
# used for the detailed summaries/VIF analysis.
lm = LinearRegression()
lm.fit(X_train,y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
def build_model(X, y):
    """Fit an OLS regression of y on X (with intercept), print the full
    summary report, and return the design matrix including the constant."""
    design = sm.add_constant(X)        # prepend the intercept column
    fitted = sm.OLS(y, design).fit()   # ordinary least squares fit
    print(fitted.summary())            # full regression report
    return design
def checkVIF(X):
    """Return a DataFrame of variance-inflation factors (rounded to 2 dp)
    for every column of X, sorted from highest to lowest VIF."""
    vif = pd.DataFrame({
        'Features': X.columns,
        'VIF': [round(variance_inflation_factor(X.values, i), 2)
                for i in range(X.shape[1])],
    })
    return vif.sort_values(by="VIF", ascending=False)
# Fit the full model (all features) and inspect multicollinearity.
X_train_new = build_model(X_train,y_train)
checkVIF(X_train_new)
# Rebuild X/y as DataFrames and refit with statsmodels so the fitted
# model object (model_fit) is available for the residual diagnostics below.
X = pd.DataFrame(X_train)
y = pd.DataFrame(y_train)
model = sm.OLS(y, sm.add_constant(X))
model_fit = model.fit()
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.687
Model: OLS Adj. R-squared: 0.686
Method: Least Squares F-statistic: 3035.
Date: Sat, 25 Sep 2021 Prob (F-statistic): 0.00
Time: 02:12:56 Log-Likelihood: -2.8418e+05
No. Observations: 20801 AIC: 5.684e+05
Df Residuals: 20785 BIC: 5.685e+05
Df Model: 15
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const -4.727e+07 1.59e+06 -29.814 0.000 -5.04e+07 -4.42e+07
bedrooms -3.746e+04 2092.043 -17.907 0.000 -4.16e+04 -3.34e+04
bathrooms 3.96e+04 3474.184 11.397 0.000 3.28e+04 4.64e+04
sqft_living 193.8176 3.929 49.330 0.000 186.117 201.519
sqft_lot 0.1354 0.049 2.738 0.006 0.038 0.232
floors -1.331e+04 4072.708 -3.268 0.001 -2.13e+04 -5327.913
waterfront 6.091e+05 1.81e+04 33.728 0.000 5.74e+05 6.44e+05
view 5.439e+04 2221.049 24.486 0.000 5e+04 5.87e+04
condition 3.072e+04 2426.242 12.660 0.000 2.6e+04 3.55e+04
grade 9.015e+04 2264.994 39.801 0.000 8.57e+04 9.46e+04
sqft_basement -38.6095 4.653 -8.298 0.000 -47.729 -29.490
lat 5.805e+05 1.09e+04 53.503 0.000 5.59e+05 6.02e+05
long -1.549e+05 1.24e+04 -12.501 0.000 -1.79e+05 -1.31e+05
sqft_living15 23.4865 3.576 6.569 0.000 16.478 30.495
sqft_lot15 -0.3894 0.076 -5.149 0.000 -0.538 -0.241
updated_age 2039.7298 74.964 27.209 0.000 1892.794 2186.666
==============================================================================
Omnibus: 17667.070 Durbin-Watson: 1.994
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1699605.539
Skew: 3.576 Prob(JB): 0.00
Kurtosis: 46.702 Cond. No. 5.68e+07
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.68e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
# create dataframe from X, y for easier plot handling
dataframe = pd.concat([X, y], axis=1)
# model values (fitted/predicted prices)
model_fitted_y = model_fit.fittedvalues
# model residuals
model_residuals = model_fit.resid
# internally studentized (normalized) residuals
model_norm_residuals = model_fit.get_influence().resid_studentized_internal
# sqrt of absolute normalized residuals (for scale-location plots)
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
# absolute residuals
model_abs_resid = np.abs(model_residuals)
# leverage, from statsmodels internals
model_leverage = model_fit.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_cooks = model_fit.get_influence().cooks_distance[0]
# Residuals-vs-fitted plot. Pass x/y as KEYWORD arguments: positional use
# is deprecated in seaborn (the FutureWarning in the original output).
plot_lm_1 = plt.figure()
plot_lm_1.axes[0] = sns.residplot(x=model_fitted_y, y=dataframe.columns[-1],
                                  data=dataframe,
                                  lowess=True,
                                  scatter_kws={'alpha': 0.5},
                                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_1.axes[0].set_title('Residuals vs Fitted')
plot_lm_1.axes[0].set_xlabel('Fitted values')
plot_lm_1.axes[0].set_ylabel('Residuals');
# Refit on the constant-augmented frame and re-check VIFs.
X_train_new = build_model(X_train_new,y_train)
checkVIF(X_train_new)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.687
Model: OLS Adj. R-squared: 0.686
Method: Least Squares F-statistic: 3035.
Date: Fri, 24 Sep 2021 Prob (F-statistic): 0.00
Time: 22:56:59 Log-Likelihood: -2.8418e+05
No. Observations: 20801 AIC: 5.684e+05
Df Residuals: 20785 BIC: 5.685e+05
Df Model: 15
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const -4.727e+07 1.59e+06 -29.814 0.000 -5.04e+07 -4.42e+07
bedrooms -3.746e+04 2092.043 -17.907 0.000 -4.16e+04 -3.34e+04
bathrooms 3.96e+04 3474.184 11.397 0.000 3.28e+04 4.64e+04
sqft_living 193.8176 3.929 49.330 0.000 186.117 201.519
sqft_lot 0.1354 0.049 2.738 0.006 0.038 0.232
floors -1.331e+04 4072.708 -3.268 0.001 -2.13e+04 -5327.913
waterfront 6.091e+05 1.81e+04 33.728 0.000 5.74e+05 6.44e+05
view 5.439e+04 2221.049 24.486 0.000 5e+04 5.87e+04
condition 3.072e+04 2426.242 12.660 0.000 2.6e+04 3.55e+04
grade 9.015e+04 2264.994 39.801 0.000 8.57e+04 9.46e+04
sqft_basement -38.6095 4.653 -8.298 0.000 -47.729 -29.490
lat 5.805e+05 1.09e+04 53.503 0.000 5.59e+05 6.02e+05
long -1.549e+05 1.24e+04 -12.501 0.000 -1.79e+05 -1.31e+05
sqft_living15 23.4865 3.576 6.569 0.000 16.478 30.495
sqft_lot15 -0.3894 0.076 -5.149 0.000 -0.538 -0.241
updated_age 2039.7298 74.964 27.209 0.000 1892.794 2186.666
==============================================================================
Omnibus: 17667.070 Durbin-Watson: 1.994
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1699605.539
Skew: 3.576 Prob(JB): 0.00
Kurtosis: 46.702 Cond. No. 5.68e+07
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.68e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
| Features | VIF | |
|---|---|---|
| 0 | const | 1213169.32 |
| 3 | sqft_living | 6.35 |
| 9 | grade | 3.49 |
| 2 | bathrooms | 3.47 |
| 13 | sqft_living15 | 2.93 |
| 15 | updated_age | 2.21 |
| 14 | sqft_lot15 | 2.12 |
| 4 | sqft_lot | 2.10 |
| 10 | sqft_basement | 2.09 |
| 5 | floors | 2.04 |
| 1 | bedrooms | 1.69 |
| 12 | long | 1.47 |
| 7 | view | 1.42 |
| 8 | condition | 1.23 |
| 6 | waterfront | 1.20 |
| 11 | lat | 1.11 |
# Remove the 'long' feature, then refit the model and re-check VIFs.
X_train_new = X_train_new.drop(columns=["long"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.684
Model: OLS Adj. R-squared: 0.684
Method: Least Squares F-statistic: 3217.
Date: Sat, 25 Sep 2021 Prob (F-statistic): 0.00
Time: 02:13:40 Log-Likelihood: -2.8426e+05
No. Observations: 20801 AIC: 5.686e+05
Df Residuals: 20786 BIC: 5.687e+05
Df Model: 14
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const -2.852e+07 5.15e+05 -55.340 0.000 -2.95e+07 -2.75e+07
bedrooms -3.763e+04 2099.798 -17.922 0.000 -4.17e+04 -3.35e+04
bathrooms 3.986e+04 3487.073 11.430 0.000 3.3e+04 4.67e+04
sqft_living 190.1095 3.932 48.345 0.000 182.402 197.817
sqft_lot 0.0880 0.050 1.779 0.075 -0.009 0.185
floors -9045.3031 4073.518 -2.221 0.026 -1.7e+04 -1060.889
waterfront 6.13e+05 1.81e+04 33.826 0.000 5.77e+05 6.49e+05
view 5.721e+04 2217.724 25.799 0.000 5.29e+04 6.16e+04
condition 2.883e+04 2430.564 11.860 0.000 2.41e+04 3.36e+04
grade 9.407e+04 2251.481 41.783 0.000 8.97e+04 9.85e+04
sqft_basement -28.2706 4.596 -6.151 0.000 -37.279 -19.263
lat 5.839e+05 1.09e+04 53.635 0.000 5.63e+05 6.05e+05
sqft_living15 13.8846 3.505 3.961 0.000 7.014 20.755
sqft_lot15 -0.4732 0.076 -6.258 0.000 -0.621 -0.325
updated_age 2337.9144 71.333 32.775 0.000 2198.096 2477.733
==============================================================================
Omnibus: 17805.209 Durbin-Watson: 1.993
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1714768.591
Skew: 3.624 Prob(JB): 0.00
Kurtosis: 46.886 Cond. No. 1.84e+07
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.84e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
| Features | VIF | |
|---|---|---|
| 0 | const | 127215.18 |
| 3 | sqft_living | 6.32 |
| 2 | bathrooms | 3.47 |
| 9 | grade | 3.42 |
| 12 | sqft_living15 | 2.79 |
| 13 | sqft_lot15 | 2.11 |
| 4 | sqft_lot | 2.08 |
| 5 | floors | 2.03 |
| 10 | sqft_basement | 2.02 |
| 14 | updated_age | 1.98 |
| 1 | bedrooms | 1.69 |
| 7 | view | 1.40 |
| 8 | condition | 1.22 |
| 6 | waterfront | 1.20 |
| 11 | lat | 1.11 |
# Remove the 'sqft_lot' feature, then refit the model and re-check VIFs.
X_train_new = X_train_new.drop(columns=["sqft_lot"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.684
Model: OLS Adj. R-squared: 0.684
Method: Least Squares F-statistic: 3464.
Date: Sat, 25 Sep 2021 Prob (F-statistic): 0.00
Time: 02:14:23 Log-Likelihood: -2.8426e+05
No. Observations: 20801 AIC: 5.686e+05
Df Residuals: 20787 BIC: 5.687e+05
Df Model: 13
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const -2.849e+07 5.15e+05 -55.310 0.000 -2.95e+07 -2.75e+07
bedrooms -3.773e+04 2099.180 -17.974 0.000 -4.18e+04 -3.36e+04
bathrooms 3.991e+04 3487.110 11.446 0.000 3.31e+04 4.67e+04
sqft_living 190.6164 3.922 48.599 0.000 182.929 198.304
floors -9299.5522 4071.221 -2.284 0.022 -1.73e+04 -1319.641
waterfront 6.124e+05 1.81e+04 33.797 0.000 5.77e+05 6.48e+05
view 5.732e+04 2216.983 25.857 0.000 5.3e+04 6.17e+04
condition 2.876e+04 2430.390 11.833 0.000 2.4e+04 3.35e+04
grade 9.41e+04 2251.540 41.795 0.000 8.97e+04 9.85e+04
sqft_basement -28.6705 4.591 -6.246 0.000 -37.668 -19.673
lat 5.832e+05 1.09e+04 53.603 0.000 5.62e+05 6.05e+05
sqft_living15 13.5670 3.501 3.875 0.000 6.705 20.429
sqft_lot15 -0.3792 0.054 -7.012 0.000 -0.485 -0.273
updated_age 2340.9640 71.316 32.825 0.000 2201.179 2480.749
==============================================================================
Omnibus: 17787.397 Durbin-Watson: 1.993
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1706676.767
Skew: 3.619 Prob(JB): 0.00
Kurtosis: 46.781 Cond. No. 1.10e+07
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.1e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
| Features | VIF | |
|---|---|---|
| 0 | const | 127026.19 |
| 3 | sqft_living | 6.28 |
| 2 | bathrooms | 3.47 |
| 8 | grade | 3.42 |
| 11 | sqft_living15 | 2.78 |
| 4 | floors | 2.03 |
| 9 | sqft_basement | 2.02 |
| 13 | updated_age | 1.98 |
| 1 | bedrooms | 1.68 |
| 6 | view | 1.40 |
| 7 | condition | 1.22 |
| 5 | waterfront | 1.20 |
| 10 | lat | 1.11 |
| 12 | sqft_lot15 | 1.08 |
# Remove the 'condition' feature, then refit the model and re-check VIFs.
X_train_new = X_train_new.drop(columns=["condition"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.682
Model: OLS Adj. R-squared: 0.682
Method: Least Squares F-statistic: 3716.
Date: Sat, 25 Sep 2021 Prob (F-statistic): 0.00
Time: 02:15:53 Log-Likelihood: -2.8433e+05
No. Observations: 20801 AIC: 5.687e+05
Df Residuals: 20788 BIC: 5.688e+05
Df Model: 12
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const -2.789e+07 5.14e+05 -54.236 0.000 -2.89e+07 -2.69e+07
bedrooms -3.664e+04 2104.156 -17.413 0.000 -4.08e+04 -3.25e+04
bathrooms 4.317e+04 3487.835 12.377 0.000 3.63e+04 5e+04
sqft_living 189.5590 3.934 48.181 0.000 181.847 197.271
floors -1.277e+04 4074.184 -3.135 0.002 -2.08e+04 -4786.848
waterfront 6.144e+05 1.82e+04 33.798 0.000 5.79e+05 6.5e+05
view 5.723e+04 2224.369 25.729 0.000 5.29e+04 6.16e+04
grade 9.475e+04 2258.382 41.957 0.000 9.03e+04 9.92e+04
sqft_basement -25.0629 4.596 -5.454 0.000 -34.071 -16.055
lat 5.724e+05 1.09e+04 52.622 0.000 5.51e+05 5.94e+05
sqft_living15 12.7671 3.512 3.636 0.000 5.884 19.650
sqft_lot15 -0.3697 0.054 -6.816 0.000 -0.476 -0.263
updated_age 2615.7708 67.654 38.664 0.000 2483.164 2748.378
==============================================================================
Omnibus: 17583.982 Durbin-Watson: 1.993
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1635514.781
Skew: 3.560 Prob(JB): 0.00
Kurtosis: 45.853 Cond. No. 1.09e+07
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.09e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
| Features | VIF | |
|---|---|---|
| 0 | const | 125827.33 |
| 3 | sqft_living | 6.28 |
| 2 | bathrooms | 3.45 |
| 7 | grade | 3.42 |
| 10 | sqft_living15 | 2.78 |
| 4 | floors | 2.02 |
| 8 | sqft_basement | 2.01 |
| 12 | updated_age | 1.77 |
| 1 | bedrooms | 1.68 |
| 6 | view | 1.40 |
| 5 | waterfront | 1.20 |
| 9 | lat | 1.10 |
| 11 | sqft_lot15 | 1.08 |
# Inspect the remaining feature columns (after 'price' was popped off earlier).
data.columns
Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
'waterfront', 'view', 'condition', 'grade', 'sqft_basement', 'lat',
'long', 'sqft_living15', 'sqft_lot15', 'updated_age'],
dtype='object')
# Remove the 'sqft_lot15' feature, then refit the model and re-check VIFs.
X_train_new = X_train_new.drop(columns=["sqft_lot15"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.681
Model: OLS Adj. R-squared: 0.681
Method: Least Squares F-statistic: 4040.
Date: Sat, 25 Sep 2021 Prob (F-statistic): 0.00
Time: 02:16:36 Log-Likelihood: -2.8436e+05
No. Observations: 20801 AIC: 5.687e+05
Df Residuals: 20789 BIC: 5.688e+05
Df Model: 11
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const -2.817e+07 5.13e+05 -54.878 0.000 -2.92e+07 -2.72e+07
bedrooms -3.532e+04 2097.472 -16.837 0.000 -3.94e+04 -3.12e+04
bathrooms 4.385e+04 3490.236 12.562 0.000 3.7e+04 5.07e+04
sqft_living 185.2078 3.886 47.655 0.000 177.590 192.826
floors -1.005e+04 4058.971 -2.476 0.013 -1.8e+04 -2093.117
waterfront 6.148e+05 1.82e+04 33.782 0.000 5.79e+05 6.5e+05
view 5.69e+04 2226.258 25.557 0.000 5.25e+04 6.13e+04
grade 9.553e+04 2257.995 42.306 0.000 9.11e+04 1e+05
sqft_basement -22.0609 4.579 -4.817 0.000 -31.037 -13.085
lat 5.779e+05 1.09e+04 53.222 0.000 5.57e+05 5.99e+05
sqft_living15 11.6169 3.512 3.308 0.001 4.734 18.500
updated_age 2631.6736 67.688 38.880 0.000 2499.000 2764.347
==============================================================================
Omnibus: 17681.398 Durbin-Watson: 1.993
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1675898.494
Skew: 3.586 Prob(JB): 0.00
Kurtosis: 46.384 Cond. No. 1.10e+06
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.1e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
| Features | VIF | |
|---|---|---|
| 0 | const | 125057.28 |
| 3 | sqft_living | 6.11 |
| 2 | bathrooms | 3.44 |
| 7 | grade | 3.41 |
| 10 | sqft_living15 | 2.78 |
| 4 | floors | 2.00 |
| 8 | sqft_basement | 1.99 |
| 11 | updated_age | 1.77 |
| 1 | bedrooms | 1.67 |
| 6 | view | 1.40 |
| 5 | waterfront | 1.20 |
| 9 | lat | 1.10 |
# Remove the 'updated_age' feature, then refit the model and re-check VIFs.
X_train_new = X_train_new.drop(columns=["updated_age"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.658
Model: OLS Adj. R-squared: 0.658
Method: Least Squares F-statistic: 4002.
Date: Sat, 25 Sep 2021 Prob (F-statistic): 0.00
Time: 02:18:22 Log-Likelihood: -2.8509e+05
No. Observations: 20801 AIC: 5.702e+05
Df Residuals: 20790 BIC: 5.703e+05
Df Model: 10
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const -3.235e+07 5.2e+05 -62.229 0.000 -3.34e+07 -3.13e+07
bedrooms -2.961e+04 2167.016 -13.665 0.000 -3.39e+04 -2.54e+04
bathrooms -2683.0388 3395.683 -0.790 0.429 -9338.842 3972.765
sqft_living 198.1978 4.010 49.423 0.000 190.337 206.058
floors -3.409e+04 4154.788 -8.205 0.000 -4.22e+04 -2.59e+04
waterfront 6.078e+05 1.88e+04 32.249 0.000 5.71e+05 6.45e+05
view 6.762e+04 2287.952 29.555 0.000 6.31e+04 7.21e+04
grade 8.088e+04 2305.804 35.076 0.000 7.64e+04 8.54e+04
sqft_basement -4.6312 4.720 -0.981 0.327 -13.883 4.621
lat 6.725e+05 1.1e+04 61.351 0.000 6.51e+05 6.94e+05
sqft_living15 4.1177 3.631 1.134 0.257 -3.000 11.236
==============================================================================
Omnibus: 17310.306 Durbin-Watson: 1.991
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1453782.513
Skew: 3.503 Prob(JB): 0.00
Kurtosis: 43.352 Cond. No. 1.08e+06
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.08e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
| Features | VIF | |
|---|---|---|
| 0 | const | 119572.12 |
| 3 | sqft_living | 6.07 |
| 7 | grade | 3.31 |
| 2 | bathrooms | 3.04 |
| 10 | sqft_living15 | 2.77 |
| 8 | sqft_basement | 1.97 |
| 4 | floors | 1.95 |
| 1 | bedrooms | 1.66 |
| 6 | view | 1.38 |
| 5 | waterfront | 1.20 |
| 9 | lat | 1.04 |
# Recompute residual diagnostics for the current fitted model.
model_fitted_y = model_fit.fittedvalues
model_residuals = model_fit.resid
# internally studentized (normalized) residuals
model_norm_residuals = model_fit.get_influence().resid_studentized_internal
# sqrt of absolute normalized residuals (for scale-location plots)
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
# absolute residuals
model_abs_resid = np.abs(model_residuals)
# leverage, from statsmodels internals
model_leverage = model_fit.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_cooks = model_fit.get_influence().cooks_distance[0]
# Residuals-vs-fitted plot. Pass x/y as KEYWORD arguments: positional use
# is deprecated in seaborn (the FutureWarning in the original output).
plot_lm_1 = plt.figure()
plot_lm_1.axes[0] = sns.residplot(x=model_fitted_y, y=dataframe.columns[-1],
                                  data=dataframe,
                                  lowess=True,
                                  scatter_kws={'alpha': 0.5},
                                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_1.axes[0].set_title('Residuals vs Fitted')
plot_lm_1.axes[0].set_xlabel('Fitted values')
plot_lm_1.axes[0].set_ylabel('Residuals');
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
# R-squared values collected from the single-feature regressions above.
# The original called list(0.1, 0.28, ...) which raises TypeError —
# list() accepts a single iterable, so use a list literal instead.
r_squared_values = [0.1, 0.28, 0.494, 0.008, 0.082, 0.072, 0.158, 0.001,
                    0.449, 0.13, 0.098, 0.000, 0.345, 0.007, 0.014]
r_squared_values
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-22-bd72e5580686> in <module>() ----> 1 list(0.1,0.28,0.494,0.008,0.082,0.072,0.158,0.001,0.449,0.13,0.098,0.000,0.345,0.007,0.014) TypeError: list expected at most 1 arguments, got 15